MacAddict 108

home *** CD-ROM | disk | FTP | other *** search

/ MacAddict 108 / MacAddict108.iso / Software / Internet & Communication / JunkMatcher 1.5.5.dmg / JunkMatcher.app / Contents / Resources / Engine / HTMLEncoding.py < prev next >

Wrap

Python Source | 2005-06-01 | 9.9 KB | 251 lines

# # HTMLEncoding.py # JunkMatcher # # Created by Benjamin Han on 2/1/05. # Copyright (c) 2005 Benjamin Han. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #!/usr/bin/env python # IMPORTANT: both HTMLEncodingExtractor and HTMLFormatter can have only a single instance! from sgmllib import SGMLParser import threading # to ensure thread-safety, since we will have only one # global HTMLEncodingExtractor and HTMLFormatter from consts import * from utilities import * import htmlentitydefs _metaCharsetPat = re.compile(r'charset\s*=\s*([^\s"\'>]*)') # there is no danger of reentrant locking, so we don't use RLock here _htmlEncodingExtractorLock = threading.Lock() _htmlFormatterLock = threading.Lock() class HTMLEncodingExtractor (SGMLParser): """ A SGMLParser-derived parser to extract web page encoding -------------------------------------------------------- hasTagHTML: True iff the given src has <HTML> tag hasTagHead: True iff the given src has <HEAD> tag Call exctract(src) to extract the encoding. """ def reset (self): SGMLParser.reset(self) self.encoding = None self._tagStack = [] self.hasTagHTML = self.hasTagHead = False def unknown_starttag(self, tag, attributes): if tag == 'html': self.hasTagHTML = True elif tag == 'head': self.hasTagHead = True self._tagStack.append(tag) def unknown_endtag(self, tag): if len(self._tagStack) and self._tagStack[-1] == tag: del self._tagStack[-1:] def do_meta (self, attrs): if len(self._tagStack) >= 2: if self._tagStack[-1] == 'head' and self._tagStack[-2] == 'html': self._extractEncoding(attrs) elif len(self._tagStack) == 1 and self._tagStack[-1] == 'head': self._extractEncoding(attrs) elif len(self._tagStack) == 0: self._extractEncoding(attrs) def _extractEncoding (self, attrs): attrDict=dict(attrs) httpEquiv = attrDict.get('http-equiv') if httpEquiv and httpEquiv.lower() == 'content-type': content = attrDict['content'] if content: mo = _metaCharsetPat.search(content) if mo: self.encoding = mo.group(1) self.setnomoretags() def extract (self, htmlSrc): """Extract the encoding from htmlSrc; returns the encoding (could be None)""" # thread-safety: multiple threads may call extract() and access the ivars simultaneously _htmlEncodingExtractorLock.acquire() try: self.reset() self.feed(htmlSrc) self.close() encoding = self.encoding except Exception, e: printException(u'Exception in HTMLEncodingExtractor.extract()', e) encoding = None _htmlEncodingExtractorLock.release() return encoding class HTMLFormatter (SGMLParser): """ A SGMLParser-derived parser to rewrite web page encoding into utf-8 ------------------------------------------------------------------- Call format(src) to get the modified HTML src. """ def reset(self): # extend (called by SGMLParser.__init__) SGMLParser.reset(self) self._pieces = [] self._tagStack = [] self.encoding = None self.insertCharset = 0 def unknown_starttag(self, tag, attrs): # called for each start tag # attrs is a list of (attr, value) tuples # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")] # Ideally we would like to reconstruct original tag and attributes, but # we may end up quoting attribute values that weren't quoted in the source # document, or we may change the type of quotes around the attribute value # (single to double quotes). # Note that improperly embedded non-HTML code (like client-side Javascript) # may be parsed incorrectly by the ancestor, causing runtime script errors. # All non-HTML code must be enclosed in HTML comment tags () # to ensure that it will pass through this parser unaltered (in handle_comment). strattrs = ''.join([' %s="%s"' % (key, value) for key, value in attrs]) self._pieces.append('<%(tag)s%(strattrs)s>' % locals()) if tag == 'html': if self.insertCharset == 2: self._pieces.append('\n<head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head>') elif tag == 'head': if self.insertCharset == 3: self._pieces.append('\n<meta http-equiv="content-type" content="text/html; charset=utf-8">') self._tagStack.append(tag) def unknown_endtag(self, tag): # called for each end tag, e.g. for </pre>, tag will be "pre" # Reconstruct the original end tag. self._pieces.append('</%(tag)s>' % locals()) if len(self._tagStack) and self._tagStack[-1] == tag: del self._tagStack[-1:] def do_meta (self, attrs): if len(self._tagStack) >= 2: if self._tagStack[-1] == 'head' and self._tagStack[-2] == 'html': self._rewriteEncoding(attrs) else: self.unknown_starttag('meta', attrs) elif len(self._tagStack) == 1 and self._tagStack[-1] == 'head': self._rewriteEncoding(attrs) elif len(self._tagStack) == 0: self._rewriteEncoding(attrs) else: self.unknown_starttag('meta', attrs) def _rewriteEncoding (self, attrs): attrDict = dict(attrs) httpEquiv = attrDict.get('http-equiv') if httpEquiv and httpEquiv.lower() == 'content-type': content = attrDict['content'] if content: mo = _metaCharsetPat.search(content) if mo: self.encoding = mo.group(1) attrDict['content'] = '%sutf-8%s' % (content[:mo.start(1)], content[mo.end(1):]) strattrs = ''.join([' %s="%s"' % (key, value) for key, value in attrDict.items()]) self._pieces.append('<meta%s>' % strattrs) # NOTE we do not push 'meta' into _tagStack - cuz multiple meta would sabotage parsing! def handle_charref(self, ref): # called for each character reference, e.g. for " ", ref will be "160" # Reconstruct the original character reference. self._pieces.append('&#%(ref)s;' % locals()) def handle_entityref(self, ref): # called for each entity reference, e.g. for "©", ref will be "copy" # Reconstruct the original entity reference. self._pieces.append('&%(ref)s' % locals()) # standard HTML entities are closed with a semicolon; other entities are not if htmlentitydefs.entitydefs.has_key(ref): self._pieces.append(';') def handle_data(self, text): # called for each block of plain text, i.e. outside of any tag and # not containing any character or entity references # Store the original text verbatim. self._pieces.append(text) def handle_comment(self, text): # called for each HTML comment, e.g.  # Reconstruct the original comment. # It is especially important that the source document enclose client-side # code (like Javascript) within comments so it can pass through this # processor undisturbed; see comments in unknown_starttag for details. self._pieces.append('' % locals()) def handle_pi(self, text): # called for each processing instruction, e.g. <?instruction> # Reconstruct original processing instruction. self._pieces.append('<?%(text)s>' % locals()) def handle_decl(self, text): # called for the DOCTYPE, if present, e.g. # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" # "http://www.w3.org/TR/html4/loose.dtd"> # Reconstruct original DOCTYPE self._pieces.append('<!%(text)s>' % locals()) def format(self, src, insertCharset, hasTagHTML, hasTagHead): """Return formatted HTML as a single string""" # thread-safety: multiple threads may call extract() and access the ivars simultaneously _htmlFormatterLock.acquire() try: self.reset() # self.insertCharset # 0: don't insert charset meta tag # 1: insert on the top # 2: insert right after <HTML> # 3: insert right after <HEAD> if insertCharset: if hasTagHead: self.insertCharset = 3 elif hasTagHTML: self.insertCharset = 2 else: self.insertCharset = 1 self.feed(src) self.close() if self.insertCharset == 1: ret = '<meta http-equiv="content-type" content="text/html; charset=utf-8">\n%s' % ''.join(self._pieces) else: ret = ''.join(self._pieces) except Exception, e: printException(u'Exception in HTMLFormatter.extract()', e) ret = '' _htmlFormatterLock.release() return ret